##6 Dec 2017

###############Sequence Analysis MP Career####################
##############################################################

setwd("C:/Users/matiav/Documents/Conference and Publications/Research and Politics 1/1R&R/sequence analysis")

#install.packages("readstata13")
#install.packages("TraMineR")
#install.packages("foreign")
#install.packages("cluster")
#install.packages("nnet")
#install.packages("texreg")
#install.packages("dcolumn")
#install.packages("booktabs")
#install.packages("ggplot2")
#install.packages("MASS")
#install.packages("Hmisc")
#install.packages("reshape2")
#install.packages("TraMineRextras")
#install.packages("readxl")
#install.packages("plm")
#install.packages("lmtest")
#install.packages("WeightedCluster")
#install.packages("dcolumn")
#install.packages("booktabs")
#install.packages("Rgraphviz")
#install.packages("gower")
#install.packages("Rtsne")
#install.packages("dendextend")
#install.packages("factoextra ")

library(factoextra)
library(dendextend)
library(dplyr) # for data cleaning
library(cluster) # for gower similarity and pam
library(Rtsne) # for t-SNE plot
library(ggplot2) # for visualization
library(gower)
library(readstata13)
library(TraMineR)
library(foreign)
library(cluster)
library(nnet)
library(texreg)
library(dcolumn)
library(booktabs)
require(foreign)
require(ggplot2)
require(MASS)
require(Hmisc)
require(reshape2)
library(WeightedCluster)
library(Rgraphviz)  
library(TraMineRextras)  
library(readxl)  
library(plyr)
library(plm)
library(lmtest)
library(futile.logger)

MPCareer = read_excel("dataset 27_8_14 checked 10_5_16.xlsx")

print(MPCareer)

MPCareer <- MPCareer[MPCareer$party_n != 3, ]

for(level in unique(MPCareer$legislature)){         #create dummies for legislature
  MPCareer[paste("dummyleg", level, sep = "_")] <- ifelse(MPCareer$legislature == level, 1, 0)
}

for(level in unique(MPCareer$party_n)){         #create dummies for party
  MPCareer[paste("dummyparty", level, sep = "_")] <- ifelse(MPCareer$party_n == level, 1, 0)
}

for(level in unique(MPCareer$uni)){         #create dummies for uni
  MPCareer[paste("dummyuni", level, sep = "_")] <- ifelse(MPCareer$uni == level, 1, 0)
}


MPCareer.inside.labels <- c("Backbench", "Frontbench", "Great Office")
MPCareer.inside.scode <- c(0,1,2)
MPCareer.inside.seq <- seqdef(MPCareer, 28:45, states = MPCareer.inside.scode,labels=MPCareer.inside.labels, missing.color="white", left="DEL")

pal <- grey(0:8/8)

###distance matrix

submat <- seqsubm(MPCareer.inside.seq, method = "TRATE", with.missing=FALSE)

dist.om1 <- seqdist(MPCareer.inside.seq, method = "OM",sm = submat, with.missing=TRUE)

###clustering - 5 clusters

wardClust <- hclust(as.dist(dist.om1), method = "ward.D")

wardRange1 <- as.clustrange(wardClust, diss=dist.om1, ncluster = 5)

summary(wardRange1, max.rank = 5)

MPCareer$clust5 <- factor(wardRange1$clustering$cluster5, levels = c(1, 2, 3, 4, 5), labels = c("Career 1", "Career 2", "Career 3", "Career 4", "Career 5"))

png("Figure1.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust5, border = NA, withlegend = T) 


dev.off()


png("Figure1bw.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust5, border = NA, withlegend = T, cpal=pal) 


dev.off()

###clustering - 4 clusters

wardRange2 <- as.clustrange(wardClust, diss=dist.om1, ncluster = 4) 
summary(wardRange2, max.rank = 10)
MPCareer$clust4 <- factor(wardRange2$clustering$cluster4, levels = c(1, 2, 3, 4), labels = c("Career 1", "Career 2", "Career 3", "Career 4"))

png("Figure2.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust4, border = NA, withlegend = T)
dev.off()


png("Figure2bw.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust4, border = NA, withlegend = T, cpal=pal)
dev.off()

###clustering - 6 clusters

wardRange3 <- as.clustrange(wardClust, diss=dist.om1, ncluster = 6) 
summary(wardRange3, max.rank = 6)
MPCareer$clust6 <- factor(wardRange3$clustering$cluster6, levels = c(1, 2, 3, 4, 5, 6), labels = c("Career 1", "Career 2", "Career 3", "Career 4", "Career 5", "Career 6"))

png("Figure3.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust6, border = NA, withlegend = T)

dev.off()


png("Figure3bw.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust6, border = NA, withlegend = T, cpal=pal)

dev.off()

###clustering - 3 clusters


wardRange4 <- as.clustrange(wardClust, diss=dist.om1, ncluster = 3) 
summary(wardRange4, max.rank = 10)
MPCareer$clust3 <- factor(wardRange1$clustering$cluster3, levels = c(1, 2, 3), labels = c("Career 1", "Career 2", "Career 3"))

png("Figure4.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust3, border = NA, withlegend = T)

dev.off()


png("Figure4bw.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust3, border = NA, withlegend = T, cpal=pal)

dev.off()

###clustering - 7 clusters

wardRange5 <- as.clustrange(wardClust, diss=dist.om1, ncluster = 7)
summary(wardRange5, max.rank = 7)
MPCareer$clust7 <- factor(wardRange5$clustering$cluster7, levels = c(1, 2, 3, 4, 5, 6, 7), labels = c("Career 1", "Career 2", "Career 3", "Career 4", "Career 5", "Career 6", "Career 7"))

png("Figure5.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust7, border = NA, withlegend = T)

dev.off()


png("Figure5bw.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust7, border = NA, withlegend = T, cpal=pal)

dev.off()


###clustering - 8 clusters

wardRange6 <- as.clustrange(wardClust, diss=dist.om1, ncluster = 8) 
summary(wardRange6, max.rank = 20)
MPCareer$clust8 <- factor(wardRange6$clustering$cluster8, levels = c(1, 2, 3, 4, 5, 6, 7, 8), labels = c("Career 1", "Career 2", "Career 3", "Career 4", "Career 5", "Career 6", "Career 7", "Career 8"))

png("Figure6.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust8, border = NA, withlegend = T)

dev.off()


png("Figure6bw.png",width = 10, height = 8, units = 'in', res = 1000)

seqdplot(MPCareer.inside.seq, group = MPCareer$clust8, border = NA, withlegend = T, cpal=pal)

dev.off()


##Discrepancy Analyis

set.seed(1)


print(dissmfacw(dist.om1 ~ gender + state_school + job + 
                  age_at_entry + byelections + dummyleg_1+ dummyleg_2+ dummyleg_3+dummyuni_1+ dummyuni_2+dummyparty_1+ dummyparty_2, data=MPCareer, R=200))

###

library(foreign)
write.dta(MPCareer, "C:/Users/matiav/Documents/Conference and Publications/Research and Politics 1/1R&R/sequence analysis/MPCareer.dta")
###

###Oxbridge
MPCareer_uni <- MPCareer[ which(MPCareer$uni!=0), ]

MPCareer_uni$Oxbridge<-1
MPCareer_uni$Oxbridge[is.na(MPCareer_uni$`Oxbridge college1`)&is.na(MPCareer_uni$`Oxbridge college2`)&is.na(MPCareer_uni$`Oxbridge college3`)] <- 0

table(MPCareer_uni$Oxbridge)

###

library(foreign)
write.dta(MPCareer_uni, "C:/Users/matiav/Documents/Conference and Publications/Research and Politics 1/1R&R/sequence analysis/MPCareer_uni.dta")
###

###Hierarchical Clustering Alone
set.seed(1234) 

library(data.table)

MPCareer_clust <- MPCareer[c(1,2,3,28:45)]

setnames(MPCareer_clust, old=c("1",  "2",  "3",  "4",  "5",  "6",  "7",  "8"  ,"9",  "10", "11", "12", "13", "14" ,"15" ,"16", "17", "18"), new=c("s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7",  "s8"  ,"s9",  "s10", "s11", "s12", "s13", "s14" ,"s15" ,"s16", "s17", "s18"))

MPCareer_clust$s1[is.na(MPCareer_clust$s1)]<- 99
MPCareer_clust$s2[is.na(MPCareer_clust$s2)]<- 99
MPCareer_clust$s3[is.na(MPCareer_clust$s3)]<- 99
MPCareer_clust$s4[is.na(MPCareer_clust$s4)]<- 99
MPCareer_clust$s5[is.na(MPCareer_clust$s5)]<- 99
MPCareer_clust$s6[is.na(MPCareer_clust$s6)]<- 99
MPCareer_clust$s7[is.na(MPCareer_clust$s7)]<- 99
MPCareer_clust$s8[is.na(MPCareer_clust$s8)]<- 99
MPCareer_clust$s9[is.na(MPCareer_clust$s9)]<- 99
MPCareer_clust$s10[is.na(MPCareer_clust$s10)]<- 99
MPCareer_clust$s11[is.na(MPCareer_clust$s11)]<- 99
MPCareer_clust$s12[is.na(MPCareer_clust$s12)]<- 99
MPCareer_clust$s13[is.na(MPCareer_clust$s13)]<- 99
MPCareer_clust$s14[is.na(MPCareer_clust$s14)]<- 99
MPCareer_clust$s15[is.na(MPCareer_clust$s15)]<- 99
MPCareer_clust$s16[is.na(MPCareer_clust$s16)]<- 99
MPCareer_clust$s17[is.na(MPCareer_clust$s17)]<- 99
MPCareer_clust$s18[is.na(MPCareer_clust$s18)]<- 99


fviz_nbclust(MPCareer_clust[c(4:21)], hcut, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2)


##
clusters <- hclust(dist(MPCareer_clust[c(3:20)],method = "euclidean"))

clusterCut <- cutree(clusters, 8)

MPCareer_clust$newclust8<-clusterCut

##

MPCareer_merged<-merge(MPCareer, MPCareer_clust, by=c("id"), all.x=TRUE) 

tbl<-table(MPCareer_merged$clust8, MPCareer_merged$newclust8)

chisq.test(tbl) 

